#libraries needed library(vegan) library(cluster) library(pvclust) #datasets envdata<-read.csv('env.csv',header=TRUE,row.names=1) speabu<-read.csv('abund.csv',header=TRUE,row.names=1) ##Conduct non-hierarchical divisive clustering #K-means (uses the actual data (euclidean distance only) #standardize first envdata.tra<-data.stand(envdata,method='standardize',margin='column',plot=F) #now euclidean distance matrix site.euc<-vegdist(envdata.tra,method='euclidean') #determine the number of clusters using a scree plot with both dissimilarity and also silhouette width nhclus.scree(site.euc,max.k=44) #use K-means clustering specifying 8 clusters (determined from scree plot), and a high number of iterations (10000) sitecl.kmeans<-kmeans(envdata.tra,centers=8,iter.max=10000,nstart=25) #look at results sitecl.kmeans #extract specific results by referencing available components, for this example we're extracting site classifications sitecl.kmeans$cluster #can also create new object (table), with just the classification data sitecl.class<-sitecl.kmeans$cluster #if you want to see what types of data are extractable type names function names(sitecl.kmeans) #evaluating number of clusters using randomization #write a script to do a randomization test #below is the max # of clusters to consider,#of randomizations performed,#of descriptors in dataset no.clus<-10 no.ran<-999 no.var<-10 #creates empty matrix to store within SSE (sum of squared error) sse<-matrix(,nrow=no.ran+1,ncol=no.clus-1) #also creates and empty matrix to store the randomized dataset randata<-matrix(,nrow=nrow(envdata.tra),ncol=ncol(envdata.tra)) #for loop that repeats all commands between {}, a total of j times (where j is the number of clusters to consider starting with 2) for ( j in 2:no.clus){ sitecl.kmeans<-kmeans(envdata.tra,centers=j, iter.max=1000, nstart=5) sse[1,j-1]<-sum(sitecl.kmeans$withinss) for (k in 1:no.ran) { for (c in 1:no.var) { randata[,c]<-envdata.tra[,c][sample(nrow(envdata.tra))] } ran.kmeans<-kmeans(randata,centers=j,iter.max=1000,nstart=5) sse[k+1,j-1]<-sum(ran.kmeans$withinss) } } diff<-colMeans(sse)-sse[1,] plot(seq(2,no.clus,by=1),diff,xlab="No. clusters",ylab='mean(ranSSE)-actualSSE')